{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Bagging Regression with Nearest Neighbors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "from sklearn.datasets import load_diabetes\n",
    "\n",
    "diabetes = load_diabetes()\n",
    "\n",
    "X = diabetes.data\n",
    "y = diabetes.target\n",
    "\n",
    "X_feature_names = ['age', 'gender', 'body mass index', 'average blood pressure','bl_0','bl_1','bl_2','bl_3','bl_4','bl_5']\n",
    "\n",
    "#bin target variable for better sampling\n",
    "bins = 50*np.arange(8)\n",
    "binned_y = np.digitize(y, bins)\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=binned_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3, error_score='raise',\n",
       "          estimator=BaggingRegressor(base_estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "          metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "          weights='uniform'),\n",
       "         bootstrap=True, bootstrap_features=False, max_features=1.0,\n",
       "         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,\n",
       "         random_state=None, verbose=0, warm_start=False),\n",
       "          fit_params=None, iid=True, n_iter=5, n_jobs=-1,\n",
       "          param_distributions={'max_features': [0.5, 1.0], 'max_samples': [0.5, 1.0], 'oob_score': [True, False], 'n_estimators': [100], 'base_estimator__n_neighbors': [3, 5]},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import BaggingRegressor\n",
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "single_estimator = KNeighborsRegressor()\n",
    "ensemble_estimator = BaggingRegressor(base_estimator = single_estimator)\n",
    "\n",
    "param_dist = {\n",
    " 'max_samples': [0.5,1.0],\n",
    " 'max_features' : [0.5,1.0],\n",
    " 'oob_score' : [True, False],\n",
    " 'base_estimator__n_neighbors': [3,5],\n",
    " 'n_estimators': [100]\n",
    " }\n",
    "\n",
    "pre_gs_inst_bag = RandomizedSearchCV(ensemble_estimator,\n",
    " param_distributions = param_dist,\n",
    " cv=3,\n",
    " n_iter = 5,\n",
    " n_jobs=-1)\n",
    "\n",
    "pre_gs_inst_bag.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'base_estimator__n_neighbors': 5,\n",
       " 'max_features': 1.0,\n",
       " 'max_samples': 0.5,\n",
       " 'n_estimators': 100,\n",
       " 'oob_score': True}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#slightly diferent from book\n",
    "pre_gs_inst_bag.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rs_bag = BaggingRegressor(**{'max_features': 1.0,\n",
    " 'max_samples': 0.5,\n",
    " 'n_estimators': 1000,\n",
    " 'oob_score': True,\n",
    " 'base_estimator': KNeighborsRegressor(n_neighbors=5)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BaggingRegressor(base_estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "          metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "          weights='uniform'),\n",
       "         bootstrap=True, bootstrap_features=False, max_features=1.0,\n",
       "         max_samples=0.5, n_estimators=1000, n_jobs=1, oob_score=True,\n",
       "         random_state=None, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rs_bag.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared 0.498096653258\n",
      "MAE :  44.3642741573\n",
      "MAPE :  0.419361955306\n"
     ]
    }
   ],
   "source": [
    "y_pred = rs_bag.predict(X_test)\n",
    "\n",
    "from sklearn.metrics import r2_score, mean_absolute_error\n",
    "\n",
    "print \"R-squared\",r2_score(y_test, y_pred)\n",
    "print \"MAE : \",mean_absolute_error(y_test, y_pred)\n",
    "print \"MAPE : \",(np.abs(y_test - y_pred)/y_test).mean()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
